home *** CD-ROM | disk | FTP | other *** search
- ; SCDB:
- ; The easiest way to build an assembly language module is to let the compiler
- ; do it, and hand-optimize from there. This program is a particularly dramatic
- ; case of the 90/10 rule: 90% of the time is spent in 10% of the code.
- ; In this program, it's more like 99/1. However, we pay a
- ; price for my laziness in starting with the compiler output, and
- ; that is that the branch points have cryptic names line ".10004".
- ; C'est la vie. (I bet I didn't spell that right. I studied German,
- ; not French.)
- ;
- ; The Aztec compiler actually has a pretty good optimizer for normal code in
- ; the regular CPU. However, the native floating point code it created was
- ; MISERABLE. The Aztec compiler spent excessive amounts of time moving values
- ; onto and off of the stack, and only used fp0. There are actually 8
- ; floating point registers, which means we can use most of them as
- ; "register double" variables, not competing with pointer or integer variables
- ; in the regular CPU. (I have to consider this a major flaw in the compiler.)
- ; Manually, therefore, I do exactly that as follows:
- ;
- ; fp7 = x1
- ; fp6 = y1
- ; fp5 = x2
- ; fp4 = y2
- ; fp3 = x_squared
- ; fp2 = y_squared
- ;
- ; The values "cx" and "cy" still come off the stack, but they should be
- ; cache-hits, so they ought to run pretty fast.
- ;
- ; fp0 is used for certain intermediate calculations. fp1 contains the value
- ; above which the loop terminates. For a standard Mandelbrot diagram, this value
- ; is 4.0, but it occurs to me that that value is rather arbitrary, so
- ; I'm putting it into fp1 so that we can play games with it. Just what happens
- ; if that value is changed? Beats heck out of me, but it might be interestng
- ; to find out... This is therefore set up to load this value from
- ; a double variable from the C code.
- ;
- ; To the extent that I understand it, I'm also trying to optimize this
- ; to take advantage of the 68882 concurrency capability. This is completely
- ; compatible with a 68881 - it just runs less rapidly since the '881 has no
- ; such concurrency capability. (On the '882, certain FMOVE.D operations can
- ; take place simultaneously with real operations and therefore take no
- ; excess time.)
- ;
- ; On a 68040, this should all run out of cache, which will speed it up still
- ; further. It may or may not fit into the 68030 cache. Someone out there
- ; who owns an '030 may wish to pursue this.
- ;
- ; To cut down on excessive FMOVE.D commands, the loop is unrolled into a double
- ; cycle (as per recommendations in the 68882 technical manual from Motorola).
- ; This costs a miniscule amount of space but saves a miniscule amount of time
- ; per loop, multiplied by millions of loops per picture. I consider this a
- ; very good trade. If I can trade a kilobyte for 5 seconds, that's a good deal,
- ; and the improvement is considerably better than that for some pictures.
- ;
- ; This code is in format for the Aztec assembler. Your assembler may vary
- ; considerably from this. The C-code given in comments below this point does
- ; NOT match that from the program I began with. What you're looking at is
- ; the C-code I ended up with after I did as much optimization as I could at
- ; the C-code level (including unwrapping the loop, and eliminating redundant
- ; calculation of "x_squared" and "y_squared").
- ;
- ; Code lines in lower case are retained from Aztec. Upper case is mine.
- ;
- ;:ts=8
- far code
- far data
- machine mc68020
- mc68881
- ;#include <math.h>
- ;extern short max_orbits;
- ;extern double threshold;
- ;
- ;/* Test_Point() tests points in the C plane to determine
- ; whether they orbit stably or escape to infinity.
- ; Returns the escape velocity or -1 if the point is
- ; in the M-set. */
- ;
- ;short int Test_Point(double cx, double cy)
- ;{
- xdef _Test_Point
- _Test_Point:
- link a5,#.2
- movem.l d2,-(sp)
-
- ; SCDB: By definition, "x" and "y" begin at zero at the first loop.
- ; Therefore, by definition, "x_squared" and "y_squared" also start at 0.
- ;
- ; double x1 = 0.0, y1 = 0.0;
- ; double x2, y2;
- ; register short int i;
- ; double x_squared = 0.0, y_squared = 0.0;
- ;
- ; i = max_orbits;
- ;
- ; SCDB: Aztec makes "int" a long, but for our purposes we'll assume it is
- ; a short. The only case in which this is a problem is if "/t" in the regular
- ; code exceeds 126, in which case the picture will probably take a year to
- ; calculate anyway. The following move.w therefore discards the top 16 bits.
- move.w _max_orbits+2,d2
-
- ; SCDB: Load the threshold, nominally "4.0" into 'fp1' for future use:
-
- FMOVE.D _threshold,fp1
-
- ;
- ; The Aztec compiler says that the following constant is "0.0". Who am I to
- ; doubt it?
-
- FMOVE.D #"$0000000000000000",fp7 ; x1 = 0
-
- FMOVE.X fp7,fp6 ; y1 = 0
- FMOVE.X fp7,fp3 ; x_squared = 0
- FMOVE.X fp7,fp2 ; y_squared = 0
-
-
- ; for (;;) {
- ;
- ; y2 = x1*y1;
- .10003
-
- FMOVE.X fp7,fp4 ; y2 = x1
- FMUL.X fp6,fp4 ; * y1
-
- ; y2 = y2 + y2 + cy;
-
- FADD.X fp4,fp4 ; y2 = y2 + y2
- FADD.D 16(A5),fp4 ; + cy
-
- ; x2 = x_squared - y_squared + cx;
-
- FMOVE.X fp3,fp5 ; x2 = x_squared
- FSUB.X fp2,fp5 ; - y_squared
- FADD.D 8(A5),fp5 ; + cx
-
- ; if ((x_squared=(x2*x2)) + (y_squared=(y2*y2)) > threshold) {
-
- FMOVE.X fp5,fp3 ; x_squared = x2
- FMUL.X fp5,fp3 ; * x2
-
- FMOVE.X fp4,fp2 ; y_squared = y2
- FMUL.X fp4,fp2 ; * y2
-
- FMOVE.X fp3,fp0 ; temp = x_squared
- FADD.X fp2,fp0 ; + y_squared
-
- fcmp.x fp1,fp0
- fble .10004 ; Branch if we're NOT done
-
- ; fall through if we ARE done
-
- ; return(max_orbits - i);
-
- move.l _max_orbits,d0
- move.w d2,a0
- sub.l a0,d0
-
- .3
- movem.l (sp)+,d2
- unlk a5
- rts
- ; }
-
-
- ; if ((--i)<=0)
- ; return(-1);
- .10004
- sub.w #1,d2
-
- bgt .10005
-
- move.l #-1,d0
- bra .3
-
-
- ;
- ; y1 = x2*y2
- ;
-
- .10005
-
- FMOVE.X fp5,fp6 ;y1 = x2
- FMUL.X fp4,fp6 ; * y2
-
-
- ; y1 = y1 + y1 + cy;
- ;
-
- FADD.X fp6,fp6 ; y1 = y1 + y1
- FADD.D 16(a5),fp6 ; + cy
-
- ; x1 = x_squared - y_squared + cx;
-
- FMOVE.X fp3,fp7 ; x1 = x_squared
- FSUB.X fp2,fp7 ; - y_squared
- FADD.D 8(a5),fp7 ; + cx
-
- ; if ((x_squared=(x1*x1)) + (y_squared=(y1*y1)) > threshold) {
-
- FMOVE.X fp7,fp3 ; x_squared = x1
- FMUL.X fp7,fp3 ; * x1
-
- FMOVE.X fp6,fp2 ; y_squared = y1
- FMUL.X fp6,fp2 ; * y1
-
- FMOVE.X fp3,fp0 ; temp = x_squared
- FADD.X fp2,fp0 ; + y_squared
-
- FCMP.X fp1,fp0
- fble .10006
-
- ; return(max_orbits - i);
-
- move.l _max_orbits,d0
- move.w d2,a0
- sub.l a0,d0
- bra .3
- ; }
-
- .10006
- ; if ((--i)<=0)
- ; return(-1);
-
- sub.w #1,d2
-
- bgt .10007
- move.l #-1,d0
- bra .3
- ; }
- .10007
- .10001
- bra .10003
- ;}
- .2 equ -48
- ;
- xref .begin
- dseg
- xref _max_orbits
- xref _threshold
- end
-
-